{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "KG_COVID_19_Data_File_Generator.ipynb", "provenance": [], "machine_shape": "hm" }, "kernelspec": { "name": "python3", "display_name": "Python 3" } }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "OKBsas4jb_5D" }, "source": [ "## Download and preprocess the Dataset into files which can be loaded effectively in the memory" ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "t-_csND5cC9s", "outputId": "8906f326-dff8-44aa-c03e-f8dfcb856631" }, "source": [ "!wget https://kg-hub.berkeleybop.io/kg-covid-19/current/kg-covid-19.tar.gz\r\n", "!tar -xf kg-covid-19.tar.gz\r\n", "!ls" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "--2020-12-10 02:18:27-- https://kg-hub.berkeleybop.io/kg-covid-19/current/kg-covid-19.tar.gz\n", "Resolving kg-hub.berkeleybop.io (kg-hub.berkeleybop.io)... 54.192.86.123, 54.192.86.55, 54.192.86.74, ...\n", "Connecting to kg-hub.berkeleybop.io (kg-hub.berkeleybop.io)|54.192.86.123|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 376275636 (359M) [application/gzip]\n", "Saving to: ‘kg-covid-19.tar.gz.1’\n", "\n", "kg-covid-19.tar.gz. 100%[===================>] 358.84M 171MB/s in 2.1s \n", "\n", "2020-12-10 02:18:29 (171 MB/s) - ‘kg-covid-19.tar.gz.1’ saved [376275636/376275636]\n", "\n", "kg-covid-19.tar.gz merged-kg_edges.tsv sample_data\n", "kg-covid-19.tar.gz.1 merged-kg_nodes.tsv\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "ExYp9rPycXMf" }, "source": [ "!head -10000000 merged-kg_edges.tsv > merged-kg_edges_head.tsv\r\n", "!tail -n +10000000 merged-kg_edges.tsv > merged-kg_edges_tail.tsv" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "wbNuhE4jcdOO" }, "source": [ "## Import packages and set edge columns" ] }, { "cell_type": "code", "metadata": { "id": "V3X8ZlDscYRg" }, "source": [ "import pandas as pd\r\n", "import json\r\n", "pd.set_option('display.max_rows', 50)\r\n", "\r\n", "columns = ['id', 'subject', 'edge_label', 'object', 'relation', 'provided_by', 'target_type', 'standard_units', 'subjectActivity', 'uo_units', 'assay_organism', 'neighborhood', 'assay', 'textmining', 'type', 'ECO_code', 'coexpression', 'cooccurence', 'edge_key', 'coexpression_transferred', 'experiments_transferred', 'standard_value', 'target_organism', 'publication', 'comment', 'Annotation_Properties', 'evidence', 'textmining_transferred', 'neighborhood_transferred', 'fusion', 'database_transferred', 'database', 'subj_exp_role', 'Assigned_by', 'num_participants', 'standard_relation', 'DB_References', 'combined_score', 'obj_exp_role', 'detection_method', 'experiments', 'objectActivity', 'standard_type', 'With', 'Date', 'homology', 'association_type', 'publications', 'target_pref_name']" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "hW2ljFZrcjXW" }, "source": [ "## Load Nodes and Edges into dataframes" ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "SyEtnVQocivq", "outputId": "66004cd5-c88d-4a69-f24a-f6f32ab56e21" }, "source": [ "kg_nodes_df = pd.read_csv('merged-kg_nodes.tsv', sep='\\t', header=0, dtype={\"id\":\"string\",\t\"name\":\"string\",\t\"category\":\"string\",\t\"provided_by\":\"string\",\t\"TTD_ID\":\"string\",\t\"iri\":\"string\",\t\"doi\":\"string\",\t\"title\":\"string\",\t\"same_as\":\"string\",\t\"tissue_chembl_id\":\"string\",\t\"TDL\":\"string\",\t\"description\":\"string\",\t\"in_taxon\tassay_type\":\"string\",\t\"bao_format\":\"string\",\t\"confidence_score\":\"string\",\t\"molecular_formula\":\"string\",\t\"synonym\":\"string\",\t\"assay_chembl_id\":\"string\",\t\"canonical_smiles\":\"string\",\t\"pubmed_id\":\"string\",\t\"ncbi_taxid\":\"string\",\t\"bao_label\":\"string\",\t\"xrefs\":\"string\",\t\"inorganic_flag\":float,\t\"full_name\":\"string\",\t\"subsets\":\"string\",\t\"molecule_type\":\"string\",\t\"polymer_flag\":float,\t\"natural_product\":float})\r\n", "kg_nodes_df = kg_nodes_df[kg_nodes_df['name'].notna()]\r\n", "kg_nodes_df.drop(['doi', 'assay_type','title','tissue_chembl_id','TDL','in_taxon','assay_type','bao_format','confidence_score','assay_chembl_id','pubmed_id','bao_label'], axis=1)\r\n", "kg_nodes_df['name'] = kg_nodes_df['name'].str.lower()\r\n", "kg_nodes_df['node_type'] = kg_nodes_df.id.str.split(\":\").str[0]\r\n", "\r\n", "kg_edges_df = pd.read_csv('merged-kg_edges_head.tsv', sep='\\t', header=0)\r\n", "kg_edges_df = kg_edges_df.drop(['id','neighborhood','textmining','coexpression', 'cooccurence','edge_key','coexpression_transferred','experiments_transferred','standard_value','textmining_transferred','neighborhood_transferred','fusion','database_transferred','database','num_participants','standard_relation','combined_score','experiments','With','Date','publications'], axis=1)\r\n", "kg_edges_tail_df = pd.read_csv('merged-kg_edges_tail.tsv', sep='\\t')\r\n", "kg_edges_tail_df.columns = columns\r\n", "kg_edges_tail_df = kg_edges_tail_df.drop(['id','neighborhood','textmining','coexpression', 'cooccurence','edge_key','coexpression_transferred','experiments_transferred','standard_value','textmining_transferred','neighborhood_transferred','fusion','database_transferred','database','num_participants','standard_relation','combined_score','experiments','With','Date','publications'], axis=1)\r\n", "kg_edges_df = kg_edges_df.append(kg_edges_tail_df)\r\n", "kg_edges_tail_df = None\r\n", "\r\n", "print(len(kg_nodes_df), len(kg_edges_df))" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py:2718: DtypeWarning: Columns (12,13) have mixed types.Specify dtype option on import or set low_memory=False.\n", " interactivity=interactivity, compiler=compiler, result=result)\n", "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py:2718: DtypeWarning: Columns (0,8,23,26,32,38,39,41,46) have mixed types.Specify dtype option on import or set low_memory=False.\n", " interactivity=interactivity, compiler=compiler, result=result)\n", "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py:2718: DtypeWarning: Columns (0,6,7,8,9,10,12,14,15,18,22,23,24,25,26,32,33,35,36,38,39,41,42,43,46,47,48) have mixed types.Specify dtype option on import or set low_memory=False.\n", " interactivity=interactivity, compiler=compiler, result=result)\n" ], "name": "stderr" }, { "output_type": "stream", "text": [ "307818 21443188\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 425 }, "id": "A0eV37bKcuIB", "outputId": "b581b13e-14d8-4360-cea8-b12d09cabe5f" }, "source": [ "kg_nodes_df.head()" ], "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idnamecategoryprovided_byTTD_IDiridoititlesame_astissue_chembl_idTDLdescriptionin_taxonassay_typebao_formatconfidence_scoremolecular_formulasynonymassay_chembl_idcanonical_smilespubmed_idncbi_taxidbao_labelxrefsinorganic_flagfull_namesubsetsmolecule_typepolymer_flagnatural_productnode_type
0ENSEMBL:ENSG00000004059arf5biolink:GeneSTRING<NA><NA><NA><NA><NA><NA><NA>ADP ribosylation factor 5NaNNaN<NA><NA><NA><NA><NA><NA><NA><NA><NA>NCBIGene:381NaN<NA><NA><NA>NaNNaNENSEMBL
1ENSEMBL:ENSG00000143933calm2biolink:GeneSTRING<NA><NA><NA><NA><NA><NA><NA>calmodulin 2NaNNaN<NA><NA><NA><NA><NA><NA><NA><NA><NA>NCBIGene:805NaN<NA><NA><NA>NaNNaNENSEMBL
2ENSEMBL:ENSG00000131089arhgef9biolink:GeneSTRING<NA><NA><NA><NA><NA><NA><NA>Cdc42 guanine nucleotide exchange factor 9NaNNaN<NA><NA><NA><NA><NA><NA><NA><NA><NA>NCBIGene:23229NaN<NA><NA><NA>NaNNaNENSEMBL
3ENSEMBL:ENSG00000178607ern1biolink:GeneSTRING<NA><NA><NA><NA><NA><NA><NA>endoplasmic reticulum to nucleus signaling 1NaNNaN<NA><NA><NA><NA><NA><NA><NA><NA><NA>NCBIGene:2081NaN<NA><NA><NA>NaNNaNENSEMBL
4ENSEMBL:ENSG00000147889cdkn2abiolink:GeneSTRING<NA><NA><NA><NA><NA><NA><NA>cyclin dependent kinase inhibitor 2ANaNNaN<NA><NA><NA><NA><NA><NA><NA><NA><NA>NCBIGene:1029NaN<NA><NA><NA>NaNNaNENSEMBL
\n", "
" ], "text/plain": [ " id name ... natural_product node_type\n", "0 ENSEMBL:ENSG00000004059 arf5 ... NaN ENSEMBL\n", "1 ENSEMBL:ENSG00000143933 calm2 ... NaN ENSEMBL\n", "2 ENSEMBL:ENSG00000131089 arhgef9 ... NaN ENSEMBL\n", "3 ENSEMBL:ENSG00000178607 ern1 ... NaN ENSEMBL\n", "4 ENSEMBL:ENSG00000147889 cdkn2a ... NaN ENSEMBL\n", "\n", "[5 rows x 31 columns]" ] }, "metadata": { "tags": [] }, "execution_count": 6 } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 218 }, "id": "xED6Or58cxis", "outputId": "ed42160e-53ee-48e1-e269-f3c9491ac6e0" }, "source": [ "kg_edges_df.head()" ], "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
subjectedge_labelobjectrelationprovided_bytarget_typestandard_unitssubjectActivityuo_unitsassay_organismassaytypeECO_codetarget_organismpublicationcommentAnnotation_Propertiesevidencesubj_exp_roleAssigned_byDB_Referencesobj_exp_roledetection_methodobjectActivitystandard_typehomologyassociation_typetarget_pref_name
0ENSEMBL:ENSG00000004059biolink:has_gene_productUniProtKB:P84085RO:0002205NCBINaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
1ENSEMBL:ENSG00000143933biolink:has_gene_productUniProtKB:P0DP24RO:0002205NCBINaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
2ENSEMBL:ENSG00000131089biolink:has_gene_productUniProtKB:O43307RO:0002205NCBINaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
3ENSEMBL:ENSG00000178607biolink:has_gene_productUniProtKB:O75460RO:0002205NCBINaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4ENSEMBL:ENSG00000147889biolink:has_gene_productUniProtKB:P42771RO:0002205NCBINaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", "
" ], "text/plain": [ " subject ... target_pref_name\n", "0 ENSEMBL:ENSG00000004059 ... NaN\n", "1 ENSEMBL:ENSG00000143933 ... NaN\n", "2 ENSEMBL:ENSG00000131089 ... NaN\n", "3 ENSEMBL:ENSG00000178607 ... NaN\n", "4 ENSEMBL:ENSG00000147889 ... NaN\n", "\n", "[5 rows x 28 columns]" ] }, "metadata": { "tags": [] }, "execution_count": 7 } ] }, { "cell_type": "markdown", "metadata": { "id": "gA5yfYl1c3jJ" }, "source": [ "## Some Helper Functions to generate the data" ] }, { "cell_type": "code", "metadata": { "id": "yBauLFAfc1CN" }, "source": [ "def next_level_relations(edge_node_list, drug_nodes, drug_edges):\r\n", "\r\n", " drug_nodes = kg_nodes_df[kg_nodes_df.id.isin(edge_node_list)]\r\n", "\r\n", " subject_edges = kg_edges_df[kg_edges_df[\"subject\"].isin(edge_node_list)]\r\n", " object_edges = kg_edges_df[kg_edges_df[\"object\"].isin(edge_node_list)]\r\n", " drug_edges = pd.concat([subject_edges, object_edges])\r\n", " #subject_tail_edges = kg_edges_tail_df[kg_edges_tail_df[\"subject\"].isin(edge_node_list)]\r\n", " #object_tail_edges = kg_edges_tail_df[kg_edges_tail_df[\"object\"].isin(edge_node_list)]\r\n", " #drug_edges = pd.concat([subject_edges, object_edges, subject_tail_edges, object_tail_edges])\r\n", "\r\n", " return drug_nodes, drug_edges\r\n", "\r\n", "\r\n", "def create_data_json(term, depth=0, generate_flag=False):\r\n", " drug_nodes = kg_nodes_df[kg_nodes_df.name.str.contains(term.lower())]\r\n", " node_list = list(drug_nodes.id)\r\n", "\r\n", " subject_edges = kg_edges_df[kg_edges_df[\"subject\"].isin(node_list)]\r\n", " object_edges = kg_edges_df[kg_edges_df[\"object\"].isin(node_list)]\r\n", " drug_edges = pd.concat([subject_edges, object_edges])\r\n", " #subject_tail_edges = kg_edges_tail_df[kg_edges_tail_df[\"subject\"].isin(node_list)]\r\n", " #object_tail_edges = kg_edges_tail_df[kg_edges_tail_df[\"object\"].isin(node_list)]\r\n", " #drug_edges = pd.concat([subject_edges, object_edges, subject_tail_edges, object_tail_edges])\r\n", "\r\n", " for depth_iter in range(depth):\r\n", " edge_node_list = list(set(list(drug_edges.subject) + list(drug_edges.object)))\r\n", " drug_nodes, drug_edges = next_level_relations(edge_node_list, drug_nodes, drug_edges)\r\n", "\r\n", " edge_node_list = list(set(list(drug_edges.subject) + list(drug_edges.object)))\r\n", " drug_nodes = kg_nodes_df[kg_nodes_df.id.isin(edge_node_list)]\r\n", "\r\n", " #drug_edges = drug_edges[drug_edges.publication != \"\"]\r\n", " \r\n", " links_df = drug_edges[['subject','edge_label','object','relation','provided_by','association_type','publication','ECO_code']]\r\n", " links_df.columns = ['source', 'value', 'target','relation','provided_by','association_type','publication','ECO_code']\r\n", " edges_df = drug_nodes[[\"id\",\"name\",\"category\",\"provided_by\"]]\r\n", " edges_df.columns = ['id', 'name', 'group', 'provided_by']\r\n", "\r\n", " data = {}\r\n", " data[\"nodes\"] = json.loads(edges_df.to_json(orient=\"records\"))\r\n", " data[\"links\"] = json.loads(links_df.to_json(orient=\"records\"))\r\n", "\r\n", " node_ids = [data[\"nodes\"][i][\"id\"] for i in range(len(data[\"nodes\"]))]\r\n", "\r\n", " trimmed_links = []\r\n", " for i in range(len(data[\"links\"])):\r\n", " if data[\"links\"][i][\"source\"] in node_ids and data[\"links\"][i][\"target\"] in node_ids:\r\n", " trimmed_links.append(data[\"links\"][i])\r\n", " data[\"links\"] = trimmed_links\r\n", "\r\n", " if generate_flag or len(data[\"links\"]) > 100:\r\n", " with open( term + \".json\", \"w\") as outfile:\r\n", " json.dump(data, outfile, indent=4)\r\n", " return len(data[\"nodes\"]), len(data[\"links\"])\r\n", "\r\n", "def generate_bubble_chart_data(data_type, column):\r\n", " if data_type == \"nodes\":\r\n", " data_df = kg_nodes_df[column].value_counts().to_frame('counts')\r\n", " else:\r\n", " data_df = kg_edges_df[column].value_counts().to_frame('counts')\r\n", " #head_data_df = kg_edges_df[column].value_counts().to_frame('counts')\r\n", " #tail_data_df = kg_edges_tail_df[column].value_counts().to_frame('counts')\r\n", " #data_df = pd.concat([head_data_df, tail_data_df])\r\n", " data_df[column] = list(data_df.index)\r\n", " data_df = pd.concat([pd.Series(row['counts'], row[column].split('|')) for _, row in data_df.iterrows()]).reset_index()\r\n", " data_df.columns = [column,\"counts\"]\r\n", " data_df = data_df.groupby(column).sum()\r\n", " data_df[column] = list(data_df.index)\r\n", " data_df.reset_index(drop=True, inplace=True)\r\n", " data_df.columns = [\"value\",\"id\"]\r\n", " data_df.to_csv(\"%s_%s.csv\" % (data_type, column))" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "WBEOKE9idBPL" }, "source": [ "## Samples" ] }, { "cell_type": "code", "metadata": { "id": "NuodfirZdAfA" }, "source": [ "create_data_json(\"ubql1_human\", generate_flag=True)" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "EuI0-BuIdGnO" }, "source": [ "generate_bubble_chart_data(\"nodes\",\"node_type\")" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "4dQobM9fdXBD", "outputId": "7b70615a-f58d-407f-eca6-7aab296329e1" }, "source": [ "!ls -lh" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "total 8.3G\n", "-rw-r--r-- 1 root root 359M Dec 3 22:19 kg-covid-19.tar.gz\n", "-rw-r--r-- 1 root root 359M Dec 3 22:19 kg-covid-19.tar.gz.1\n", "-rw-r--r-- 1 root root 2.0G Dec 10 02:19 merged-kg_edges_head.tsv\n", "-rw-r--r-- 1 root root 1.8G Dec 10 02:19 merged-kg_edges_tail.tsv\n", "-rw-r--r-- 1 114 120 3.8G Dec 3 03:22 merged-kg_edges.tsv\n", "-rw-r--r-- 1 114 120 83M Dec 3 03:01 merged-kg_nodes.tsv\n", "drwxr-xr-x 1 root root 4.0K Dec 2 22:04 sample_data\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "VGEXAf-Yduei", "outputId": "a543c3fd-f791-45ae-a166-afaaf900c76a" }, "source": [ "print(len(set(list(kg_edges_df.subject) + list(kg_edges_df.object))), len(list(kg_nodes_df.id)))" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "372799 307818\n" ], "name": "stdout" } ] } ] }